## different dataframes - Tara
hotels <- read_csv("hotel_booking.csv")
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   .default = col_double(),
##   hotel = col_character(),
##   arrival_date_month = col_character(),
##   meal = col_character(),
##   country = col_character(),
##   market_segment = col_character(),
##   distribution_channel = col_character(),
##   reserved_room_type = col_character(),
##   assigned_room_type = col_character(),
##   deposit_type = col_character(),
##   agent = col_character(),
##   company = col_character(),
##   customer_type = col_character(),
##   reservation_status = col_character(),
##   reservation_status_date = col_date(format = "")
## )
## ℹ Use `spec()` for the full column specifications.
hotels_refined = hotels %>% select(hotel, arrival_date_month, arrival_date_day_of_month, arrival_date_year, country ,adr, is_canceled) %>% rename(Hotel_Type = hotel, Country_of_Origin = country, Arrival_month = arrival_date_month, Arrival_date = arrival_date_day_of_month, Arrival_year = arrival_date_year, Average_Daily_Rate = adr)

month_levels <- c("January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December")
repeat_levels <- c(0, 1)
hotels_refined$Arrival_month =  factor(hotels_refined$Arrival_month, levels = month_levels)

city_hotels = hotels_refined %>% filter(Hotel_Type == "City Hotel", is_canceled == 0) %>% filter(Country_of_Origin != "NULL")
resort_hotels = hotels_refined %>% filter(Hotel_Type == "Resort Hotel", is_canceled == 0) %>% filter(Country_of_Origin != "NULL")


city_hotels_freq = city_hotels%>% filter(Average_Daily_Rate != 0.00)%>% group_by(Country_of_Origin, Arrival_month) %>% summarise(n = n()) %>% mutate(frequency = n/sum(n))
## `summarise()` has grouped output by 'Country_of_Origin'. You can override using the `.groups` argument.
#different dataframes - David
df1 = df %>%
  arrange(arrival_date_month) %>%
  select(lead_time, arrival_date_month) %>%
  group_by(arrival_date_month) %>% 
  summarize(avg_lead_time=mean(lead_time)) %>%
  ungroup()

Creator: TARA GHORPADKAR

Q1: How do seasonal patterns associated with the country of origin affect ADR (Average Daily Rate)?

hotels_refined = hotels %>% select(hotel, arrival_date_month, arrival_date_day_of_month, arrival_date_year, country ,adr, is_canceled) %>% rename(Hotel_Type = hotel, Country_of_Origin = country, Arrival_month = arrival_date_month, Arrival_date = arrival_date_day_of_month, Arrival_year = arrival_date_year, Average_Daily_Rate = adr)

month_levels <- c("January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December")
repeat_levels <- c(0, 1)
hotels_refined$Arrival_month =  factor(hotels_refined$Arrival_month, levels = month_levels)

city_hotels = hotels_refined %>% filter(Hotel_Type == "City Hotel", is_canceled == 0) %>% filter(Country_of_Origin != "NULL")
resort_hotels = hotels_refined %>% filter(Hotel_Type == "Resort Hotel", is_canceled == 0) %>% filter(Country_of_Origin != "NULL")


city_hotels_freq = city_hotels%>% filter(Average_Daily_Rate != 0.00)%>% group_by(Country_of_Origin, Arrival_month) %>% summarise(n = n()) %>% mutate(frequency = n/sum(n))
## `summarise()` has grouped output by 'Country_of_Origin'. You can override using the `.groups` argument.
resort_hotels_freq = resort_hotels %>% filter(Average_Daily_Rate != 0.00)%>% group_by(Country_of_Origin, Arrival_month) %>% summarise(n = n()) %>% mutate(frequency = n/sum(n))
## `summarise()` has grouped output by 'Country_of_Origin'. You can override using the `.groups` argument.
city_hotels_adr = city_hotels %>% group_by(Country_of_Origin, Arrival_month) %>% arrange(Country_of_Origin, Arrival_month) %>% summarize_at(vars(Average_Daily_Rate), list( ~mean(., na.rm = TRUE) )) %>% rename(Average_Monthly_Rate = Average_Daily_Rate) 

resort_hotels_adr = resort_hotels %>% group_by(Country_of_Origin, Arrival_month) %>% arrange(Country_of_Origin, Arrival_month) %>% summarize_at(vars(Average_Daily_Rate), list( ~mean(., na.rm = TRUE) )) %>% rename(Average_Monthly_Rate = Average_Daily_Rate) 

city_hotels_top = city_hotels %>% group_by(Country_of_Origin) %>% summarise(Num_Customers = n()) %>% arrange(desc(Num_Customers)) %>% head(10)
city_hotels_join = inner_join(city_hotels_adr, city_hotels_top, by = "Country_of_Origin")
city_hotels_join
## # A tibble: 120 x 4
## # Groups:   Country_of_Origin [10]
##    Country_of_Origin Arrival_month Average_Monthly_Rate Num_Customers
##    <chr>             <fct>                        <dbl>         <int>
##  1 BEL               January                       84.8          1479
##  2 BEL               February                      92.1          1479
##  3 BEL               March                         97.1          1479
##  4 BEL               April                        115.           1479
##  5 BEL               May                          132.           1479
##  6 BEL               June                         123.           1479
##  7 BEL               July                         124.           1479
##  8 BEL               August                       122.           1479
##  9 BEL               September                    128.           1479
## 10 BEL               October                      120.           1479
## # … with 110 more rows
resort_hotels_top = resort_hotels %>% group_by(Country_of_Origin) %>% summarise(Num_Customers = n()) %>% arrange(desc(Num_Customers)) %>% head(10)
resort_hotels_join = inner_join(resort_hotels_adr, resort_hotels_top, by = "Country_of_Origin")
resort_hotels_join
## # A tibble: 120 x 4
## # Groups:   Country_of_Origin [10]
##    Country_of_Origin Arrival_month Average_Monthly_Rate Num_Customers
##    <chr>             <fct>                        <dbl>         <int>
##  1 BEL               January                       58.2           389
##  2 BEL               February                      54.8           389
##  3 BEL               March                         50.1           389
##  4 BEL               April                         67.6           389
##  5 BEL               May                           89.0           389
##  6 BEL               June                          98.7           389
##  7 BEL               July                         174.            389
##  8 BEL               August                       195.            389
##  9 BEL               September                     96.5           389
## 10 BEL               October                       68.2           389
## # … with 110 more rows
city_hotels_join %>% ggplot(aes(x = Arrival_month, y = Average_Monthly_Rate, color = Country_of_Origin)) + geom_point() + theme(axis.text.y = element_text(angle = 45)) + theme(axis.text.x = element_text(angle = 50, hjust = 1), panel.background = element_blank(), plot.title = element_text(hjust = 0.5)) + ggtitle("City Hotels")

resort_hotels_join %>% ggplot() + geom_point(mapping = aes(x = Arrival_month, y = Average_Monthly_Rate, color = Country_of_Origin)) + theme(axis.text.y = element_text(angle = 45)) + theme(axis.text.x = element_text(angle = 50, hjust = 1), panel.background = element_blank(), plot.title = element_text(hjust = 0.5)) + ggtitle("Resort Hotels")

city_hotels_freq = city_hotels_freq %>% filter(n > 80, frequency < 0.6) 
city_hotels_freq %>% ggplot() + geom_point(mapping = aes(x = Arrival_month, y = frequency, color = Country_of_Origin)) + theme(axis.text.y = element_text(angle = 45)) + theme(axis.text.x = element_text(angle = 50, hjust = 1), panel.background = element_blank(), plot.title = element_text(hjust = 0.5)) + ggtitle("City Hotel")

resort_hotels_freq = resort_hotels_freq %>% filter(n > 80) 
resort_hotels_freq %>% ggplot() + geom_point(mapping = aes(x = Arrival_month, y = frequency, color = Country_of_Origin)) + theme(axis.text.y = element_text(angle = 45)) + theme(axis.text.x = element_text(angle = 50, hjust = 1), panel.background = element_blank(), plot.title = element_text(hjust = 0.5))+ ggtitle("Resort Hotel")

city_hotels2 = hotels %>% filter(hotel == "City Hotel", is_canceled == 0) %>% select(arrival_date_year, arrival_date_month, arrival_date_day_of_month,stays_in_week_nights, stays_in_weekend_nights, adr) %>% mutate(length_of_stay = stays_in_week_nights + stays_in_weekend_nights) %>% arrange(arrival_date_year, arrival_date_month, arrival_date_day_of_month)



city_hotels2$arrival_date_month = as.integer(factor(city_hotels2$arrival_date_month, levels = month.name))


city_hotels2 = city_hotels2 %>% select(arrival_date_year, arrival_date_month, arrival_date_day_of_month, length_of_stay, adr) %>% unite("arrival_date", c("arrival_date_year", "arrival_date_month", "arrival_date_day_of_month"), sep = "/") %>% filter(adr != 0.00)

city_hotels2$arrival_date <- as.Date(city_hotels2$arrival_date)
city_hotels2 = city_hotels2 %>% arrange(arrival_date)  %>% group_by(arrival_date) %>% summarise(avg_length_of_stay = mean(length_of_stay), avg_adr = mean(adr))


city_hotels2 %>% ggplot() + geom_point(mapping = aes(x = arrival_date, y = avg_adr, color = avg_length_of_stay)) + ggtitle("City Hotels")

city_hotels2
## # A tibble: 785 x 3
##    arrival_date avg_length_of_stay avg_adr
##    <date>                    <dbl>   <dbl>
##  1 2015-07-01                 2.09    96.6
##  2 2015-07-02                 3       58.7
##  3 2015-07-03                 2.5     74.5
##  4 2015-07-04                 4       63.8
##  5 2015-07-06                 1       66.1
##  6 2015-07-07                 6       69.2
##  7 2015-07-08                 2.27    64.8
##  8 2015-07-09                 5       58.9
##  9 2015-07-10                 4.5     93.3
## 10 2015-07-11                 2.74    87.1
## # … with 775 more rows
resort_hotels2 = hotels %>% filter(hotel == "Resort Hotel", is_canceled == 0) %>% select(arrival_date_year, arrival_date_month, arrival_date_day_of_month,stays_in_week_nights, stays_in_weekend_nights, adr) %>% mutate(length_of_stay = stays_in_week_nights + stays_in_weekend_nights) %>% arrange(arrival_date_year, arrival_date_month, arrival_date_day_of_month)



resort_hotels2$arrival_date_month = as.integer(factor(resort_hotels2$arrival_date_month, levels = month.name))


resort_hotels2 = resort_hotels2 %>% select(arrival_date_year, arrival_date_month, arrival_date_day_of_month, length_of_stay, adr) %>% unite("arrival_date", c("arrival_date_year", "arrival_date_month", "arrival_date_day_of_month"), sep = "/") %>% filter(adr != 0.00)

resort_hotels2$arrival_date <- as.Date(resort_hotels2$arrival_date)
resort_hotels2 = resort_hotels2 %>% arrange(arrival_date)  %>% group_by(arrival_date) %>% summarise(avg_length_of_stay = mean(length_of_stay), avg_adr = mean(adr))


resort_hotels2 %>% ggplot() + geom_point(mapping = aes(x = arrival_date, y = avg_adr, color = avg_length_of_stay)) + ggtitle("Resort Hotels")

resort_hotels2
## # A tibble: 793 x 3
##    arrival_date avg_length_of_stay avg_adr
##    <date>                    <dbl>   <dbl>
##  1 2015-07-01                 4.94    93.2
##  2 2015-07-02                 5.77   100. 
##  3 2015-07-03                 5      109. 
##  4 2015-07-04                 5.77    97.6
##  5 2015-07-05                 6.11   108. 
##  6 2015-07-06                 6.26   112. 
##  7 2015-07-07                 5.08   121. 
##  8 2015-07-08                 5.95   105. 
##  9 2015-07-09                 4.43   101. 
## 10 2015-07-10                 5.56   124. 
## # … with 783 more rows

Q2: How does the average daily rate (ADR) change over time?

model_data_city = city_hotels2
start_date = as.Date("2015-07-01")
model_data_city$arrival_date <- as.numeric(difftime(model_data_city$arrival_date, start_date, unit = "days"))
NumDays.city <- model_data_city$arrival_date
xc <- cos(2*pi*NumDays.city/366)
xs <- sin(2*pi*NumDays.city/366)
fit.lm <- lm(model_data_city$avg_adr ~ xc + xs)
fit <- fitted(fit.lm)
summary(fit.lm)
## 
## Call:
## lm(formula = model_data_city$avg_adr ~ xc + xs)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -62.408  -9.783  -0.760  11.136  75.582 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 103.2384     0.6420 160.813   <2e-16 ***
## xc           17.8297     0.8970  19.877   <2e-16 ***
## xs            0.7110     0.9177   0.775    0.439    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 17.92 on 782 degrees of freedom
## Multiple R-squared:  0.3373, Adjusted R-squared:  0.3356 
## F-statistic:   199 on 2 and 782 DF,  p-value: < 2.2e-16
pred <- predict(fit.lm, newdata = data.frame(NumDays.city = NumDays.city))
plot(model_data_city$avg_adr ~ NumDays.city, data= model_data_city, xlim=c(1, 900))
lines(fit, col="red")
lines(NumDays.city, pred, col="blue")

NumDays.city <- model_data_city$arrival_date
Avg_ADR.city <- model_data_city$avg_adr
fit.lm <- lm(Avg_ADR.city ~ poly(NumDays.city, 6, raw=TRUE))
fit <- fitted(fit.lm)
summary(fit.lm)
## 
## Call:
## lm(formula = Avg_ADR.city ~ poly(NumDays.city, 6, raw = TRUE))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -28.554  -7.989  -1.215   6.440  78.009 
## 
## Coefficients:
##                                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                         5.568e+01  3.466e+00   16.06   <2e-16 ***
## poly(NumDays.city, 6, raw = TRUE)1  1.642e+00  1.160e-01   14.15   <2e-16 ***
## poly(NumDays.city, 6, raw = TRUE)2 -2.197e-02  1.254e-03  -17.52   <2e-16 ***
## poly(NumDays.city, 6, raw = TRUE)3  1.172e-04  5.898e-06   19.87   <2e-16 ***
## poly(NumDays.city, 6, raw = TRUE)4 -2.860e-07  1.348e-08  -21.22   <2e-16 ***
## poly(NumDays.city, 6, raw = TRUE)5  3.225e-10  1.473e-11   21.89   <2e-16 ***
## poly(NumDays.city, 6, raw = TRUE)6 -1.363e-13  6.166e-15  -22.11   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12.81 on 778 degrees of freedom
## Multiple R-squared:  0.6628, Adjusted R-squared:  0.6602 
## F-statistic: 254.9 on 6 and 778 DF,  p-value: < 2.2e-16
pred <- predict(fit.lm, newdata = data.frame(NumDays.city = NumDays.city))
plot(Avg_ADR.city ~ NumDays.city, data= model_data_city, xlim=c(1, 900))
lines(fit, col="red")
lines(NumDays.city, pred, col="blue")

Interpreter: SIDDHARTHA VANAM

Q1: Does a correlation exist between time on waitlist and a greater ADR?

df %>% select(adr, reservation_status, days_in_waiting_list, reserved_room_type, assigned_room_type, market_segment, country_name, stays_in_week_nights, stays_in_weekend_nights, arrival_date_year, lead_time, hotel) %>% 
  filter(days_in_waiting_list < 300, adr <200, days_in_waiting_list != 0) %>% 
  ggplot(aes(days_in_waiting_list, adr)) + geom_point(aes(color=reserved_room_type, alpha =0.4)) + geom_smooth(method="lm", se=T) + facet_wrap(~hotel, nrow=2) +labs(title = "ADR vs. Days on the Waitlist",
       x = "Days on the Waitlist",
       y = "ADR")+
  theme(panel.background = element_blank(),
        plot.title = element_text(hjust = 0.5)) + scale_alpha(guide = 'none')
## `geom_smooth()` using formula 'y ~ x'

Q2: Which customer types tend to cancel non-refund bookings?

df[df$deposit_type== "Non Refund", ] %>% group_by(customer_type, is_canceled ) %>% mutate(count = n()) %>% 
  ggplot(aes(x=reorder(customer_type, -count), y=count)) + geom_bar(stat = "identity", position = position_dodge()) + theme_bw() + 
  labs(title = "Cancelled Non-Refund bookings by Customer Type",
       x = "Customer Type",
       y = "Count") +
  theme(panel.background = element_blank(),
        plot.title = element_text(hjust = 0.5))

Orator: DAVID SNIDER

Q1: How does lead time vary by month?

  • Maintain low prices during busy months and use waitlist to book excess customers in non-busy months
  • Raise prices during busy months to raise more revenue
ggplot(df1) + 
  geom_line(aes(
    x=arrival_date_month,
    y=avg_lead_time,
    group=1
  )) + 
  ggtitle("Average Lead Time vs Arrival Month") + 
  xlab("Arrival Month") + 
  ylab("Average Lead Time (Days)") + 
  theme(axis.text.x=element_text(angle=30, vjust=1,),
        plot.title=element_text(hjust=0.5))

Q2: Which variables best predict cancellation?

library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
leadgraph = df %>% 
  group_by(as.logical(is_canceled)) %>%
  summarize(avg_lead_time = mean(lead_time)) %>% 
  ungroup() %>%
  rename(value=avg_lead_time)
leadgraph$key=rep("Average Lead Time", 2)

prevgraph = df %>%
  group_by(as.logical(is_canceled)) %>%
  summarize(avg_prev_canc = mean(previous_cancellations)) %>% 
  ungroup() %>%
  rename(value=avg_prev_canc)
prevgraph$key=rep("Average Previous Cancellations", 2)

prevuncancgraph = df %>%
  group_by(as.logical(is_canceled)) %>%
  summarize(avg_prev_uncanc = mean(previous_bookings_not_canceled)) %>% 
  ungroup() %>%
  rename(value=avg_prev_uncanc)
prevuncancgraph$key=rep("Average Previous Bookings Not Canceled", 2)

adrgraph = df %>%
  group_by(as.logical(is_canceled)) %>%
  summarize(avg_adr = mean(adr)) %>% 
  ungroup() %>%
  rename(value=avg_adr)
adrgraph$key=rep("Average ADR", 2)

fullgraph = bind_rows(leadgraph, prevgraph, prevuncancgraph, adrgraph)

ggplot(fullgraph, aes(x=`as.logical(is_canceled)`, y=value)) + 
  geom_col() + 
  facet_wrap(~key, scales = "free")

pvals = c()

pvals = c(pvals, t.test(df$lead_time~as.logical(df$is_canceled))$p.value[[1]])
pvals = c(pvals, t.test(df$previous_cancellations~as.logical(df$is_canceled))$p.value[[1]])
pvals = c(pvals, t.test(df$previous_bookings_not_canceled~as.logical(df$is_canceled))$p.value[[1]])
pvals = c(pvals, t.test(df$adr~as.logical(df$is_canceled))$p.value[[1]])

vars = c("Average Lead Time",
         "Average Previous Cancellations",
         "Average Previous Bookings Not Canceled",
         "Average ADR")

pvaltable = data.frame(Variables = vars, "P values" = pvals)
pvaltable$P.values = as.character(pvaltable$P.values)
to_print = pvaltable %>% 
  xtable(align="ccc")
print(to_print, 
      "html", 
      html.table.attributes="align='center',
                             rules='rows',
                             width=50%,
                             frame='hsides',
                             border-spacing=5px"
      )
## <!-- html table generated in R 4.1.0 by xtable 1.8-4 package -->
## <!-- Thu Jul 29 23:35:10 2021 -->
## <table align='center',
##                              rules='rows',
##                              width=50%,
##                              frame='hsides',
##                              border-spacing=5px>
## <tr> <th>  </th> <th> Variables </th> <th> P.values </th>  </tr>
##   <tr> <td align="center"> 1 </td> <td align="center"> Average Lead Time </td> <td align="center"> 0 </td> </tr>
##   <tr> <td align="center"> 2 </td> <td align="center"> Average Previous Cancellations </td> <td align="center"> 3.44419715019362e-196 </td> </tr>
##   <tr> <td align="center"> 3 </td> <td align="center"> Average Previous Bookings Not Canceled </td> <td align="center"> 5.87710671289984e-129 </td> </tr>
##   <tr> <td align="center"> 4 </td> <td align="center"> Average ADR </td> <td align="center"> 9.7601257426029e-59 </td> </tr>
##    </table>
hotelgraph = df %>%
  group_by(hotel) %>%
  summarize(
    n=n(),
    num_canceled = sum(is_canceled),
    prop_canceled = mean(is_canceled)
  ) %>%
  ungroup() %>%
  rename(value=hotel)
hotelgraph$key=rep("Hotel", 2)

repeatgraph = df %>%
  group_by(as.logical(is_repeated_guest)) %>%
  summarize(
    n=n(),
    num_canceled = sum(is_canceled),
    prop_canceled = mean(is_canceled)
  ) %>%
  ungroup() %>%
  rename(value=`as.logical(is_repeated_guest)`)
repeatgraph$key=rep("Is Repeated Guest?", 2)
repeatgraph$value=as.character(repeatgraph$value)

fullgraph2 = bind_rows(hotelgraph, repeatgraph)

ggplot(fullgraph2, aes(x=value, y=prop_canceled)) + 
  geom_col() + 
  facet_wrap(~key, scales="free")

detach("package:MASS", unload = TRUE)

Deliverer: WALKER BURGIN

Q1: Are bookings by travel agents and tour operators (TA/TO) more likely to get cancelled than direct bookings?

hotel_bookings <- read_csv("hotel_booking.csv")
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   .default = col_double(),
##   hotel = col_character(),
##   arrival_date_month = col_character(),
##   meal = col_character(),
##   country = col_character(),
##   market_segment = col_character(),
##   distribution_channel = col_character(),
##   reserved_room_type = col_character(),
##   assigned_room_type = col_character(),
##   deposit_type = col_character(),
##   agent = col_character(),
##   company = col_character(),
##   customer_type = col_character(),
##   reservation_status = col_character(),
##   reservation_status_date = col_date(format = "")
## )
## ℹ Use `spec()` for the full column specifications.
pop_Mean_canceled = mean(hotel_bookings$is_canceled)
pop_Mean_canceled #our population sample
## [1] 0.3704163
hotel_bookings$cumul_cancellations <- hotel_bookings$previous_cancellations+hotel_bookings$is_canceled
hotel_bookings$cumul_bookings <- hotel_bookings$previous_bookings_not_canceled+1-hotel_bookings$is_canceled
hotel_bookings$customer_cancellation_rate_total <- hotel_bookings$cumul_cancellations/(hotel_bookings$cumul_cancellations +hotel_bookings$cumul_bookings)
fcustomer_cancellation_rate_total <-tibble(
  filter(hotel_bookings,customer_cancellation_rate_total>0&customer_cancellation_rate_total<1)
)
fcustomer_cancellation_rate_total$cancellation_rate <- fcustomer_cancellation_rate_total$cumul_cancellations/(fcustomer_cancellation_rate_total$cumul_bookings+fcustomer_cancellation_rate_total$cumul_cancellations)
ggplot(data=fcustomer_cancellation_rate_total)+geom_histogram(mapping=aes(cancellation_rate),fill="red",color="white")+geom_vline(xintercept=pop_Mean_canceled,color="red")+
labs(title = "Cancellation Rates by Customer",
       x = "Cancellation Rate",
       y = "Frequency") +
  theme(panel.background = element_blank(),
        plot.title = element_text(hjust = 0.5))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

agentDist <- filter(hotel_bookings,hotel_bookings$distribution_channel=="TA/TO")
agentDist$cancellation_rate <- agentDist$cumul_cancellations /(agentDist$cumul_bookings+agentDist$cumul_cancellations)
agent_sampl_Mean <- mean(agentDist$is_canceled)
agent_sampl_Mean #agents-only sample
## [1] 0.4102585
table <- agentDist %>%
  group_by(agent) %>%
  summarize(mean = mean(as.integer(is_canceled)), sum = sum(as.integer(is_canceled)))
tableB <- filter(table,mean>0&mean<1)
ggplot(data=tableB)+geom_histogram(mapping=aes(mean),color="white",fill="purple")+geom_vline(xintercept=agent_sampl_Mean,color="red")+geom_vline(xintercept=pop_Mean_canceled,color="purple")+
labs(title = "Cancellation Rate, per Agent",
       x = "Cancellation Rate",
       y = "Frequency") +
  theme(panel.background = element_blank(),
        plot.title = element_text(hjust = 0.5))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

nonAgentDist <- tibble(
  filter(hotel_bookings,agent=="NULL"&customer_cancellation_rate_total>0&customer_cancellation_rate_total<1)
)
nonAgentMean <- mean(nonAgentDist$customer_cancellation_rate_total)

random_sample_rates <- tibble(tableB)
random_sample_rates$randPop <- sample(fcustomer_cancellation_rate_total$cancellation_rate,168)
random_sample_rates$randNon <- sample(nonAgentDist$customer_cancellation_rate_total,168)

random_sample_rates$randNon <- sample(nonAgentDist$customer_cancellation_rate_total,168)
ggplot(data=random_sample_rates)+geom_histogram(mapping=aes(tableB$mean),color="white",fill="red")+geom_histogram(mapping=aes(randPop),color="white",fill="purple")+geom_histogram(mapping=aes(randNon),color="white",fill="blue")+geom_vline(xintercept=agent_sampl_Mean,color="red")+geom_vline(xintercept=pop_Mean_canceled,color="purple")+geom_vline(xintercept=nonAgentMean,color="blue")+labs(title = "Cancellation Rate, per Agent (red), per non-Agent (blue), and population (purple)",
       x = "Cancellation Rate",
       y = "Frequency") + theme(panel.background = element_blank())
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#Therefore, our data implies that agents (represented in red) consistently cancel more nonAgents (blue) and the general population of both (purple).

Q2: How does the pattern of people staying affect the ADR? For example, does a greater amount of children in the hotel negatively affect the ADR?

adr_pop_Mean = mean(hotel_bookings$adr)
# mean for the population is 101.8311
adr_revised<-hotel_bookings %>% filter(adr != 5400 & adr != 510)
ggplot(data=adr_revised)+geom_histogram(mapping=aes(adr),fill="blue",color="white")+geom_vline(xintercept = adr_pop_Mean,color="blue")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

child <- as.integer(hotel_bookings$children)
hotel_bookings$childCount = child
withoutChild<-tibble(filter(hotel_bookings,child=="0"&adr != 5400 & adr != 510))
adr_sampl_Mean = mean(withoutChild$adr)
# mean for the childless is 97.47
ggplot(data=withoutChild)+geom_histogram(mapping=aes(adr),color="white",fill="red")+geom_vline(xintercept = adr_sampl_Mean,color="red")+geom_vline(xintercept = adr_pop_Mean,color="blue")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#Both histograms appear extremely similar, but as our means are significantly different given our dataset, we can conclude that, on average, groups that have children have a lower ADR than those without.

Follow-up Questions

New Questions Based Off Initial Investigation

  • Q1: Can we model hotel average daily rate over a time series sinusoidally or parabolically?
  • Q2: Which model best predicts cancellation?
  • Q3: When is the proportion of cancellations the highest? (Maybe optimize adr with the least number of cancellations?)
  • Q4: Can certain combinations of arrival month and country of origin negatively weigh on the adr vs other combinations of arrival month and country of origin?

Investigation of Follow-up Questions

Our group decided to investigate Q1 and Q2 in further detail.

Can we model hotel average daily rate over a time series sinusoidally or parabolically?

model_data_resort = resort_hotels2
model_data_resort$arrival_date <- as.numeric(difftime(model_data_resort$arrival_date, start_date, unit = "days"))
NumDays.resort <- model_data_resort$arrival_date
xc <- cos(2*pi*NumDays.resort/366)
xs <- sin(2*pi*NumDays.resort/366)
fit.lm <- lm(model_data_resort$avg_adr ~ xc + xs)
fit <- fitted(fit.lm)
summary(fit.lm)
## 
## Call:
## lm(formula = model_data_resort$avg_adr ~ xc + xs)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -65.645 -21.414  -1.789  14.524 155.845 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   89.456      1.062   84.24   <2e-16 ***
## xc            49.277      1.478   33.34   <2e-16 ***
## xs            21.450      1.523   14.08   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 29.75 on 790 degrees of freedom
## Multiple R-squared:  0.6327, Adjusted R-squared:  0.6317 
## F-statistic: 680.3 on 2 and 790 DF,  p-value: < 2.2e-16
pred <- predict(fit.lm, newdata = data.frame(NumDays.resort = NumDays.resort))
plot(model_data_resort$avg_adr ~ NumDays.resort, data= model_data_resort, xlim=c(1, 900))
lines(fit, col="red")
lines(NumDays.resort, pred, col="blue")

Avg_ADR.resort <- model_data_resort$avg_adr
fit.lm <- lm(Avg_ADR.resort ~ poly(NumDays.resort, 6, raw=TRUE))
fit <- fitted(fit.lm)
summary(fit.lm)
## 
## Call:
## lm(formula = Avg_ADR.resort ~ poly(NumDays.resort, 6, raw = TRUE))
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -52.36 -17.84  -1.92  12.20 137.53 
## 
## Coefficients:
##                                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                           1.232e+02  6.288e+00  19.590  < 2e-16 ***
## poly(NumDays.resort, 6, raw = TRUE)1  1.290e+00  2.207e-01   5.844 7.45e-09 ***
## poly(NumDays.resort, 6, raw = TRUE)2 -3.257e-02  2.434e-03 -13.379  < 2e-16 ***
## poly(NumDays.resort, 6, raw = TRUE)3  2.013e-04  1.156e-05  17.418  < 2e-16 ***
## poly(NumDays.resort, 6, raw = TRUE)4 -5.151e-07  2.656e-08 -19.390  < 2e-16 ***
## poly(NumDays.resort, 6, raw = TRUE)5  5.862e-10  2.913e-11  20.123  < 2e-16 ***
## poly(NumDays.resort, 6, raw = TRUE)6 -2.454e-13  1.222e-14 -20.082  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 25.68 on 786 degrees of freedom
## Multiple R-squared:  0.7277, Adjusted R-squared:  0.7256 
## F-statistic: 350.1 on 6 and 786 DF,  p-value: < 2.2e-16
pred <- predict(fit.lm, newdata = data.frame(NumDays.resort = NumDays.resort))
plot(Avg_ADR.resort ~ NumDays.resort, data= model_data_resort, xlim=c(1, 900))
lines(fit, col="red")
lines(NumDays.resort, pred, col="blue")

Which model best predicts cancellation?

library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
df_log = df %>%
  dplyr::select(hotel,
         is_canceled,
         lead_time,
         previous_cancellations,
         previous_bookings_not_canceled,
         adr,
         is_repeated_guest
         ) %>% mutate(id=row_number())

set.seed(216)
df_train=df_log %>% 
  sample_frac(0.80)
df_test=anti_join(df_log, df_train, by='id')
df_train= df_train %>%
  subset(select = -id)
df_test= df_test %>%
  subset(select= -id)

Model 1: Logistic

model1 = glm(
  is_canceled~., 
  family="binomial",
  data=df_train)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
options("scipen"=100, "digits"=4)
summary(model1)
## 
## Call:
## glm(formula = is_canceled ~ ., family = "binomial", data = df_train)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -8.490  -0.893  -0.686   1.190   6.578  
## 
## Coefficients:
##                                  Estimate Std. Error z value
## (Intercept)                    -1.3828643  0.0210942   -65.6
## hotelResort Hotel              -0.4658058  0.0158784   -29.3
## lead_time                       0.0049329  0.0000715    69.0
## previous_cancellations          2.9149599  0.0577364    50.5
## previous_bookings_not_canceled -0.6372580  0.0288014   -22.1
## adr                             0.0037409  0.0001530    24.4
## is_repeated_guest              -1.0641241  0.0877746   -12.1
##                                           Pr(>|z|)    
## (Intercept)                    <0.0000000000000002 ***
## hotelResort Hotel              <0.0000000000000002 ***
## lead_time                      <0.0000000000000002 ***
## previous_cancellations         <0.0000000000000002 ***
## previous_bookings_not_canceled <0.0000000000000002 ***
## adr                            <0.0000000000000002 ***
## is_repeated_guest              <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 125985  on 95511  degrees of freedom
## Residual deviance: 110568  on 95505  degrees of freedom
## AIC: 110582
## 
## Number of Fisher Scoring iterations: 7
df_test = df_test %>% add_predictions(
  model1,
  var="predicted_canc1") %>%
  mutate(predicted_canc1 = ifelse(predicted_canc1 > 0.5,1,0))

Model 2: Stepwise logistic

model2 = stepAIC(model1)
## Start:  AIC=110582
## is_canceled ~ hotel + lead_time + previous_cancellations + previous_bookings_not_canceled + 
##     adr + is_repeated_guest
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
##                                  Df Deviance    AIC
## <none>                                110568 110582
## - is_repeated_guest               1   110769 110781
## - adr                             1   111163 111175
## - hotel                           1   111490 111502
## - previous_bookings_not_canceled  1   112027 112039
## - lead_time                       1   115811 115823
## - previous_cancellations          1   115816 115828
df_test = df_test %>% add_predictions(
  model2,
  var="predicted_canc2") %>%
  mutate(predicted_canc2 = ifelse(predicted_canc2 > 0.5,1,0))

Model 3: Logistic with twofold interaction

model3 = glm(
  is_canceled~.^2, 
  family="binomial",
  data=df_train)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
tidy(model3)[,c("term", "estimate", "p.value")]
## # A tibble: 22 x 3
##    term                                             estimate   p.value
##    <chr>                                               <dbl>     <dbl>
##  1 (Intercept)                                      -1.45    0        
##  2 hotelResort Hotel                                -0.406   2.36e- 22
##  3 lead_time                                         0.00585 9.32e-208
##  4 previous_cancellations                            2.90    1.29e- 25
##  5 previous_bookings_not_canceled                   -0.755   4.74e- 10
##  6 adr                                               0.00404 1.84e- 51
##  7 is_repeated_guest                                -0.851   3.62e-  8
##  8 hotelResort Hotel:lead_time                      -0.00119 5.70e- 14
##  9 hotelResort Hotel:previous_cancellations          0.461   1.36e-  2
## 10 hotelResort Hotel:previous_bookings_not_canceled -0.0386  6.32e-  1
## # … with 12 more rows
df_test = df_test %>% add_predictions(
  model3,
  var="predicted_canc3") %>%
  mutate(predicted_canc3 = ifelse(predicted_canc3 > 0.5,1,0))

Model4: k-NN, k=5

#standardize dataset
standardize = function(vector) {
  return(sd(vector)*vector + 
           mean(vector)
              )
}

df_knn = df_test %>%
  mutate(
    previous_cancellations=standardize(previous_cancellations),
    previous_bookings_not_canceled=standardize(previous_bookings_not_canceled),
    hotel=ifelse(hotel=="Resort Hotel", 1, 0),
    hotel=standardize(hotel),
    lead_time=standardize(lead_time),
    adr=standardize(adr)
    ) %>% 
  mutate(predicted_canc4=knn(
    train=dplyr::select(
      df_train,
      lead_time,
      adr,
      previous_cancellations
      ),
    test=dplyr::select(
      df_test,
      lead_time,
      adr,
      previous_cancellations
    ),
    cl=factor(
      df_train$is_canceled,
      levels=c(0,1),
      labels=c("0","1")
    ),
    k=5)
  ) %>%
  dplyr::select(predicted_canc4)

df_test = bind_cols(df_test, df_knn) %>%
  mutate(predicted_canc4 = as.integer(predicted_canc4)-1)

Metrics

#input df_test$predicted_canc1, or 2, etc.
sensitivity = function(predicted_canc) {
  return(
    sum(df_test$is_canceled & predicted_canc) /
      sum(df_test$is_canceled)
  )
}

specificity = function(predicted_canc) {
  return(
    sum(!df_test$is_canceled & !predicted_canc) /
      sum(!df_test$is_canceled)
  )
}

accuracy = function(predicted_canc) {
  (
  sum(df_test$is_canceled & predicted_canc) + 
    sum(!df_test$is_canceled & !predicted_canc)
  ) / nrow(df_test)
}

metrics_table = tribble(
  ~Model, ~sensitivity, ~specificity, ~accuracy,
  1, sensitivity(df_test$predicted_canc1), specificity(df_test$predicted_canc1), accuracy(df_test$predicted_canc1),
  2, sensitivity(df_test$predicted_canc2), specificity(df_test$predicted_canc2), accuracy(df_test$predicted_canc2),
  3, sensitivity(df_test$predicted_canc3), specificity(df_test$predicted_canc3), accuracy(df_test$predicted_canc3),
  4, sensitivity(df_test$predicted_canc4), specificity(df_test$predicted_canc4), accuracy(df_test$predicted_canc4)
)

to_print1 = metrics_table %>% 
  xtable(align="ccccc")
print(to_print1, 
      "html", 
      html.table.attributes="align='center',
                             rules='rows',
                             width=50%,
                             frame='hsides',
                             border-spacing=5px"
)
Model sensitivity specificity accuracy
1 1.00 0.19 0.98 0.69
2 2.00 0.19 0.98 0.69
3 3.00 0.19 0.98 0.69
4 4.00 0.60 0.85 0.76
detach("package:MASS", unload = TRUE)

Summary

Our team looked at hotel booking data. Customers visit more often and rates are higher during the summer. However, customers from some countries, particularly those near Portugal, arrive more frequently during winter months. Longer stays are associated with a lower average daily rate, but the majority of stays range from 2-3 days for both hotels. More than the average length of stay, average daily rate seems to oscillate with time of month more often, and is especially present in the resort hotel. However, stays of 1 day are usually plotted higher on the y axis for both graphs compared to stays of 3 days or longer on the same x axis location (meaning on the same day). Our team next reviewed which percentage of the bookings as a whole are canceled: 37%. When we produce the same observation for an agents-only sample, we find that among agents, 41% of bookings are canceled. Comparing the data, we found that agents consistently cancel more non-agents and the general population of both. For the next question, we created a statistic for the population ADR mean, and then filtered the customers without children into a separate data frame to analyze if significant differences exist between the two groups. On average, we found that groups that have children have a lower ADR than those without. For our first follow up question, we wanted to see the difference between a sinusoidal and polynomial regression for the relationship between arrival date and average daily rate. We used data previously arranged to plot arrival time and average daily rate in a time series. In order to find this model, We needed to tweak the arrival date data. Instead of showing the actual arrival date, the x axis now indicated the number of days since the first recorded date of a customer. For both city and resort data, the sinusoidal curve seems to be a better fit. The R^2 values for the polynomial curve are better for both the resort and city data, as they are higher than the sinusoidal R^2 values. The polynomial used was a sextic function, as all graphs seemed to have 5 inflection points. The oscillations in the graph suggested that this data could be plotted using these functions. The R^2 value for resort data sinusoidal was 0.6317, and the R^2 value for resort data polynomial was 0.7256. The R^2 value for city data sinusoidal was 0.3356, and the R^2 value for city data polynomial was 0.6602, indicating that the sextic function was a better fit for both data. This indicates that while the data oscillates, there are certain arrival dates that strongly forecast lower average daily rates and certain arrival dates that strongly forecast higher average daily rates with a peak in the average daily rate ranging around July of each year. For our next question, we asked which model best predicts cancellation. We made 4 models: Model 1 is logistic regression, Model 2 stepwise logistic regression, Model 3 logistic regression with two-variable interaction, and Model 4 k-NN, with k=5. A table of with metrics measuring the success of these models is above. The k-NN model was the most accurate and the most sensitive, but it was the least specific, compared against the other logistic regression models. To determine whether sensitivity or specificity matters more, we need to consider the practical context of our model. We were thinking our model could be used to help hotel managers create a “fast-track” waitlist of people to replace the individuals who are more likely to cancel. As long as the hotels don’t actually guarantee rooms to folks on the fast-track waitlist, it should not matter whether managers wrongly predict cancellation. That is, it should not matter how “specific” managers are. In this context, we care more so about sensitivity. As such, the k-NN model would be the best fit. We finally explored whether there is a correlation or relationship between the days on the waitlist and the hotel’s Average Daily Revenue (ADR). We found a weak negative relationship between days on the waitlist and the ADR for city hotels and a weak positive relationship between the two variables for resort hotels. Next, we explored how often non-refund bookings canceled. The data showed that the “transient” customer type canceled an overwhelming number of bookings.

model_data_city = city_hotels2
start_date = as.Date("2015-07-01")
model_data_city$arrival_date <- as.numeric(difftime(model_data_city$arrival_date, start_date, unit = "days"))
xc <- cos(2*pi*model_data_city$arrival_date/365.25)
xs <- sin(2*pi*model_data_city$arrival_date/365.25)
fit.lm <- lm(avg_adr ~ xc + xs + arrival_date, data = model_data_city)
model_data_city$pred1 <- predict(fit.lm, model_data_city)
model.func = function(day){
  return(
    fit.lm$coefficients[[1]] + fit.lm$coefficients[[2]]*cos(2*pi*day/365.25) + fit.lm$coefficients[[3]]*sin(2*pi*day/365.25) + fit.lm$coefficients[[4]]*day
  )
}
p1 <- ggplot() + geom_point(data = model_data_city, aes(x = arrival_date, y = avg_adr)) + geom_hline(aes(yintercept=0))
p1.trend = p1  + 
  geom_line(data = model_data_city, aes(x = arrival_date, y = pred1), color="red", size = 2)
new = data.frame(arrival_date=793:1577)
#new$xc <- cos(2*pi*new$arrival_date/366)
#new$xs <- sin(2*pi*new$arrival_date/366)
#new$pred1 <- model.func(new)
model_data_city = bind_rows(model_data_city, new)
model_data_city$pred <- model.func(model_data_city$arrival_date)
model_data_city
## # A tibble: 1,570 x 5
##    arrival_date avg_length_of_stay avg_adr pred1  pred
##           <dbl>              <dbl>   <dbl> <dbl> <dbl>
##  1            0               2.09    96.6  95.8  95.8
##  2            1               3       58.7  95.9  95.9
##  3            2               2.5     74.5  96.1  96.1
##  4            3               4       63.8  96.2  96.2
##  5            5               1       66.1  96.5  96.5
##  6            6               6       69.2  96.6  96.6
##  7            7               2.27    64.8  96.7  96.7
##  8            8               5       58.9  96.8  96.8
##  9            9               4.5     93.3  96.9  96.9
## 10           10               2.74    87.1  97.1  97.1
## # … with 1,560 more rows
p1.trend + geom_line(data = model_data_city, aes(x = arrival_date, y = pred)) + geom_vline(xintercept = 792)

#model_data_city = model_data_city %>% select(arrival_date, avg_adr)
#predict(lm(avg_adr ~ xc + xs + model_data_city$arrival_date, data = model_data_city), newdata = pred)
#p1 +
  #geom_line(color="blue", data=new) + geom_vline(xintercept = 792)



model_data_resort = resort_hotels2
model_data_resort$arrival_date <- as.numeric(difftime(model_data_resort$arrival_date, start_date, unit = "days"))
NumDays.resort <- model_data_resort$arrival_date
xc <- cos(2*pi*model_data_resort$arrival_date/365.25)
xs <- sin(2*pi*model_data_resort$arrival_date/365.25)
fit.lm <- lm(avg_adr ~ xc + xs + arrival_date, data = model_data_resort)
model_data_resort$pred1 <- predict(fit.lm, model_data_resort)
p2 <- ggplot() + geom_point(data =model_data_resort, aes(x = arrival_date, y = avg_adr)) + geom_hline(aes(yintercept=0))
p2.trend = p2  + 
  geom_line(data = model_data_resort, aes(x = arrival_date, y = pred1), color="red", size = 2)
new = data.frame(arrival_date=793:1577)
model_data_resort = bind_rows(model_data_resort, new)
model_data_resort$pred <- model.func(model_data_resort$arrival_date)
model_data_resort
## # A tibble: 1,578 x 5
##    arrival_date avg_length_of_stay avg_adr pred1  pred
##           <dbl>              <dbl>   <dbl> <dbl> <dbl>
##  1            0               4.94    93.2  117.  117.
##  2            1               5.77   100.   117.  117.
##  3            2               5      109.   118.  118.
##  4            3               5.77    97.6  118.  118.
##  5            4               6.11   108.   118.  118.
##  6            5               6.26   112.   119.  119.
##  7            6               5.08   121.   119.  119.
##  8            7               5.95   105.   120.  120.
##  9            8               4.43   101.   120.  120.
## 10            9               5.56   124.   121.  121.
## # … with 1,568 more rows
p2.trend + geom_line(data = model_data_resort, aes(x = arrival_date, y = pred)) + geom_vline(xintercept = 792)